Finger exercise

Visualizing the World:

# Load the WHO dataset: one row per country with demographic and
# socioeconomic indicators; str() below shows 194 countries x 13 variables.
WHO <- read.csv("WHO.csv")
str(WHO)
## 'data.frame':    194 obs. of  13 variables:
##  $ Country                      : Factor w/ 194 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Region                       : Factor w/ 6 levels "Africa","Americas",..: 3 4 1 4 1 2 2 4 6 4 ...
##  $ Population                   : int  29825 3162 38482 78 20821 89 41087 2969 23050 8464 ...
##  $ Under15                      : num  47.4 21.3 27.4 15.2 47.6 ...
##  $ Over60                       : num  3.82 14.93 7.17 22.86 3.84 ...
##  $ FertilityRate                : num  5.4 1.75 2.83 NA 6.1 2.12 2.2 1.74 1.89 1.44 ...
##  $ LifeExpectancy               : int  60 74 73 82 51 75 76 71 82 81 ...
##  $ ChildMortality               : num  98.5 16.7 20 3.2 163.5 ...
##  $ CellularSubscribers          : num  54.3 96.4 99 75.5 48.4 ...
##  $ LiteracyRate                 : num  NA NA NA NA 70.1 99 97.8 99.6 NA NA ...
##  $ GNI                          : num  1140 8820 8310 NA 5230 ...
##  $ PrimarySchoolEnrollmentMale  : num  NA NA 98.2 78.4 93.1 91.1 NA NA 96.9 NA ...
##  $ PrimarySchoolEnrollmentFemale: num  NA NA 96.4 79.4 78.2 84.5 NA NA 97.5 NA ...
# Base-R scatterplot: gross national income vs. fertility rate.
plot(WHO$GNI, WHO$FertilityRate)
library(ggplot2)

# ggplot builds plots in layers: map the aesthetics first, then add a
# geometry. The warnings below come from countries with NA in GNI or
# FertilityRate, which geom_point silently drops.
scatterplot = ggplot(WHO, aes(x = GNI, y = FertilityRate))
scatterplot + geom_point()
## Warning: Removed 35 rows containing missing values (geom_point).

# Restyled points: shape 17 is a filled triangle.
scatterplot + geom_point(color = "blue", size = 3, shape=17)
## Warning: Removed 35 rows containing missing values (geom_point).

# Shape 8 is an asterisk; ggtitle() adds a main title. The plot object is
# stored so it can be printed to a file below.
fertilityGNIplot = scatterplot + geom_point(color = "darkred", size = 3, shape=8) + ggtitle("Fertility rate vs. GNI")
# save in file
# pdf() opens a file graphics device; inside a script a ggplot object must
# be print()-ed explicitly for anything to be drawn.
pdf("Myplot.pdf")
print(fertilityGNIplot)
## Warning: Removed 35 rows containing missing values (geom_point).
# back to terminal
# dev.off() closes the PDF file and reactivates the previous device.
dev.off()
## quartz_off_screen 
##                 2
# Shape 15 is a filled square.
scatterplot + geom_point(color = "blue", size = 3, shape=15)
## Warning: Removed 35 rows containing missing values (geom_point).

# Color points by the Region factor (discrete legend).
ggplot(WHO,aes(x=GNI,y=FertilityRate,color=Region))+geom_point()
## Warning: Removed 35 rows containing missing values (geom_point).

# Coloring by a numeric variable gives a continuous color gradient instead.
ggplot(WHO,aes(x=GNI,y=FertilityRate,color=LifeExpectancy))+geom_point()
## Warning: Removed 35 rows containing missing values (geom_point).

# Fertility rate vs. percentage of population under 15: clearly nonlinear.
ggplot(WHO,aes(x=FertilityRate,y=Under15))+geom_point()
## Warning: Removed 11 rows containing missing values (geom_point).

# A log transform of FertilityRate makes the relationship near-linear;
# stat_smooth(method="lm") overlays the least-squares fit.
ggplot(WHO,aes(x=log(FertilityRate),y=Under15))+geom_point() + stat_smooth(method="lm",se=FALSE,color="orange")
## Warning: Removed 11 rows containing missing values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).

# Regress Under15 on log(FertilityRate); the summary below shows
# R-squared of about 0.94, confirming the near-linear log relationship.
model = lm(Under15~log(FertilityRate),data = WHO)
summary(model)
## 
## Call:
## lm(formula = Under15 ~ log(FertilityRate), data = WHO)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3131  -1.7742   0.0446   1.7440   7.7174 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.6540     0.4478   17.09   <2e-16 ***
## log(FertilityRate)  22.0547     0.4175   52.82   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 181 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.9391, Adjusted R-squared:  0.9387 
## F-statistic:  2790 on 1 and 181 DF,  p-value: < 2.2e-16
# Color points by Region using a ColorBrewer qualitative palette.
ggplot(WHO, aes(x = FertilityRate, y = Under15, color=Region)) + geom_point()+scale_color_brewer(palette="Dark2") 
## Warning: Removed 11 rows containing missing values (geom_point).

The Analytical Policeman

# Motor vehicle thefts in Chicago: one row per incident with a timestamp
# and coordinates.
# FIX: the original passed stringsAsFactor= (no trailing "s"), which only
# worked via partial argument matching -- spell out stringsAsFactors so
# Date is read as character, ready for parsing below.
mvt <- read.csv("mvt.csv", stringsAsFactors = FALSE)
str(mvt)
## 'data.frame':    191641 obs. of  3 variables:
##  $ Date     : chr  "12/31/12 23:15" "12/31/12 22:00" "12/31/12 22:00" "12/31/12 22:00" ...
##  $ Latitude : num  41.8 41.9 42 41.8 41.8 ...
##  $ Longitude: num  -87.6 -87.7 -87.8 -87.7 -87.6 ...
# strptime() returns POSIXlt, whose list components expose the parsed
# fields directly -- that is what mvt$Date$hour relies on below.
mvt$Date <- strptime(mvt$Date, format = "%m/%d/%y %H:%M")
mvt$Weekday <- weekdays(mvt$Date)
mvt$Hour <- mvt$Date$hour
# Total thefts per day of week (factor levels are alphabetical here).
table(mvt$Weekday)
## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##     29284     27397     27118     26316     27319     26791     27416
# Convert the table to a data frame (Var1 = weekday, Freq = count) so
# ggplot can plot it.
WeekdayCounts = as.data.frame(table(mvt$Weekday))
str(WeekdayCounts)
## 'data.frame':    7 obs. of  2 variables:
##  $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7
##  $ Freq: int  29284 27397 27118 26316 27319 26791 27416
library(ggplot2)
# lineplot 
# Re-level the weekday factor so the x-axis runs Sunday..Saturday instead
# of alphabetically; group=1 makes geom_line connect all seven points as
# one line.
WeekdayCounts$Var1 = factor(WeekdayCounts$Var1,ordered = TRUE,levels=c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"))
# linetype = 2 draws the line dashed.
ggplot(WeekdayCounts,aes(x=Var1,y=Freq))+geom_line(aes(group=1),linetype = 2)+xlab("Day of the Week")+ylab("Total Motorvehicle Thefts")

# alpha = 0.3 draws the same line mostly transparent.
ggplot(WeekdayCounts,aes(x=Var1,y=Freq))+geom_line(aes(group=1),alpha = 0.3)+xlab("Day of the Week")+ylab("Total Motorvehicle Thefts")

# heat map 
# Cross-tabulate weekday by hour of day (counts per cell shown below).
table(mvt$Weekday,mvt$Hour)
##            
##                0    1    2    3    4    5    6    7    8    9   10   11
##   Friday    1873  932  743  560  473  602  839 1203 1268 1286  938  822
##   Monday    1900  825  712  527  415  542  772 1123 1323 1235  971  737
##   Saturday  2050 1267  985  836  652  508  541  650  858 1039  946  789
##   Sunday    2028 1236 1019  838  607  461  478  483  615  864  884  787
##   Thursday  1856  816  696  508  400  534  799 1135 1298 1301  932  731
##   Tuesday   1691  777  603  464  414  520  845 1118 1175 1174  948  786
##   Wednesday 1814  790  619  469  396  561  862 1140 1329 1237  947  763
##            
##               12   13   14   15   16   17   18   19   20   21   22   23
##   Friday    1207  857  937 1140 1165 1318 1623 1652 1736 1881 2308 1921
##   Monday    1129  824  958 1059 1136 1252 1518 1503 1622 1815 2009 1490
##   Saturday  1204  767  963 1086 1055 1084 1348 1390 1570 1702 2078 1750
##   Sunday    1192  789  959 1037 1083 1160 1389 1342 1706 1696 2079 1584
##   Thursday  1093  752  831 1044 1131 1258 1510 1537 1668 1776 2134 1579
##   Tuesday   1108  762  908 1071 1090 1274 1553 1496 1696 1816 2044 1458
##   Wednesday 1225  804  863 1075 1076 1289 1580 1507 1718 1748 2093 1511
# One row per (weekday, hour) combination with its theft count.
DayHourCounts = as.data.frame(table(mvt$Weekday,mvt$Hour))
str(DayHourCounts)
## 'data.frame':    168 obs. of  3 variables:
##  $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7 1 2 3 ...
##  $ Var2: Factor w/ 24 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ Freq: int  1873 1900 2050 2028 1856 1691 1814 932 825 1267 ...
# Var2 holds the hour labels as a factor; convert via as.character first --
# as.numeric on the factor itself would return level indices, not hours.
DayHourCounts$Hour = as.numeric(as.character(DayHourCounts$Var2))
str(DayHourCounts)
## 'data.frame':    168 obs. of  4 variables:
##  $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7 1 2 3 ...
##  $ Var2: Factor w/ 24 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ Freq: int  1873 1900 2050 2028 1856 1691 1814 932 825 1267 ...
##  $ Hour: num  0 0 0 0 0 0 0 1 1 1 ...
# One line per weekday across the 24 hours.
ggplot(DayHourCounts,aes(x=Hour,y=Freq))+geom_line(aes(group=Var1,color=Var1),size=2)

# Order weekdays Monday..Sunday for the heat map's y-axis, then draw a
# geom_tile heat map where fill intensity encodes the theft count.
DayHourCounts$Var1 = factor(DayHourCounts$Var1,ordered=TRUE, levels=c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))
ggplot(DayHourCounts,aes(x = Hour,y=Var1))+geom_tile(aes(fill=Freq))+scale_fill_gradient(name="Total MV Theft",low="white",high="red") + theme(axis.title.y = element_blank())

#install.packages("maps")
#install.packages("ggmap")
library(maps)
library(ggmap)

# Fetch a static map of Chicago (the URLs below show the API calls made).
chicago = get_map(location="chicago",zoom=11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=chicago&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=chicago&sensor=false
# Overlay the first 100 thefts as points on the map.
ggmap(chicago) + geom_point(data = mvt[1:100,],aes(x=Longitude,y=Latitude))
## Warning: Removed 7 rows containing missing values (geom_point).

# Count thefts per grid cell: coordinates rounded to 2 decimal places
# (roughly 0.01-degree bins), then cross-tabulated.
LatLonCounts = as.data.frame(table(round(mvt$Longitude,2),round(mvt$Latitude,2)))
str(LatLonCounts)
## 'data.frame':    1638 obs. of  3 variables:
##  $ Var1: Factor w/ 42 levels "-87.93","-87.92",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Var2: Factor w/ 39 levels "41.64","41.65",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq: int  0 0 0 0 0 0 0 0 0 0 ...
#convert factor to numeric 
# The table() output stores coordinates as factors; go through
# as.character to recover the numeric values rather than level indices.
LatLonCounts$Long = as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat = as.numeric(as.character(LatLonCounts$Var2))
# Points sized and colored by theft count per grid cell.
ggmap(chicago) + geom_point(data = LatLonCounts, aes(x = Long, y = Lat, color = Freq, size=Freq)) + scale_colour_gradient(low="yellow", high="red")
## Warning: Removed 615 rows containing missing values (geom_point).

# Tile version: alpha encodes the count on solid red tiles.
ggmap(chicago) + geom_tile(data = LatLonCounts,aes(x=Long,y=Lat,alpha = Freq),fill = "red")

# Drop the zero-count cells so empty grid squares are not drawn at all.
LatLonCounts2 = subset(LatLonCounts,Freq>0)
ggmap(chicago) + geom_tile(data = LatLonCounts2,aes(x=Long,y=Lat,alpha = Freq),fill = "red")

ggmap(chicago) + geom_point(data = LatLonCounts2, aes(x = Long, y = Lat, color = Freq, size=Freq)) + scale_colour_gradient(low="yellow", high="red")
## Warning: Removed 119 rows containing missing values (geom_point).

A Heatmap on the United States

# State-level murder statistics: 51 rows (50 states plus DC), with
# population, murder counts, and gun-ownership figures.
murders = read.csv("murders.csv")
str(murders)
## 'data.frame':    51 obs. of  6 variables:
##  $ State            : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Population       : int  4779736 710231 6392017 2915918 37253956 5029196 3574097 897934 601723 19687653 ...
##  $ PopulationDensity: num  94.65 1.26 57.05 56.43 244.2 ...
##  $ Murders          : int  199 31 352 130 1811 117 131 48 131 987 ...
##  $ GunMurders       : int  135 19 232 93 1257 65 97 38 99 669 ...
##  $ GunOwnership     : num  0.517 0.578 0.311 0.553 0.213 0.347 0.167 0.255 0.036 0.245 ...
# map_data("state") gives polygon outlines for the continental US states:
# one row per vertex, polygons identified by the "group" column.
statesMap=map_data("state")
str(statesMap)
## 'data.frame':    15537 obs. of  6 variables:
##  $ long     : num  -87.5 -87.5 -87.5 -87.5 -87.6 ...
##  $ lat      : num  30.4 30.4 30.4 30.3 30.3 ...
##  $ group    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ order    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region   : chr  "alabama" "alabama" "alabama" "alabama" ...
##  $ subregion: chr  NA NA NA NA ...
# Blank US map: one polygon per "group", white fill, black borders.
ggplot(statesMap,aes(x=long, y = lat, group=group)) +geom_polygon(fill = "white",color = "black")

# The map data names states in lowercase ("alabama"), so create a matching
# lowercase key on murders before merging the two tables on "region".
murders$region=tolower(murders$State)
murderMap = merge(statesMap,murders,by ="region")
str(murderMap)
## 'data.frame':    15537 obs. of  12 variables:
##  $ region           : chr  "alabama" "alabama" "alabama" "alabama" ...
##  $ long             : num  -87.5 -87.5 -87.5 -87.5 -87.6 ...
##  $ lat              : num  30.4 30.4 30.4 30.3 30.3 ...
##  $ group            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ order            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ subregion        : chr  NA NA NA NA ...
##  $ State            : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Population       : int  4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 ...
##  $ PopulationDensity: num  94.7 94.7 94.7 94.7 94.7 ...
##  $ Murders          : int  199 199 199 199 199 199 199 199 199 199 ...
##  $ GunMurders       : int  135 135 135 135 135 135 135 135 135 135 ...
##  $ GunOwnership     : num  0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 ...
# Choropleth of raw murder counts -- dominated by population size.
ggplot(murderMap,aes(x=long,y=lat,group=group,fill=Murders))+geom_polygon(color="black")+scale_fill_gradient(low="black",high="red",guide="legend")

# The population map looks very similar, confirming the confound.
ggplot(murderMap,aes(x=long,y=lat,group=group,fill=Population))+geom_polygon(color="black")+scale_fill_gradient(low="black",high="red",guide="legend")

# Murders per 100,000 residents; limits=c(0,10) caps the color scale so a
# single extreme value cannot wash out the differences among the rest.
murderMap$MurderRate = murderMap$Murders/murderMap$Population*100000
ggplot(murderMap,aes(x=long,y=lat,group=group,fill=MurderRate))+geom_polygon(color="black")+scale_fill_gradient(low="black",high="red",guide="legend",limits=c(0,10))

# Fill each state with the variable GunOwnership.
# BUG FIX: GunOwnership is already a rate -- the proportion of the state's
# population that owns a gun (0.517, 0.578, ... in str(murders) above) --
# so the original's division by Population produced a meaningless
# quantity. Map the variable directly instead.
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = GunOwnership)) +
  geom_polygon(color = "black") +
  scale_fill_gradient(low = "black", high = "red", guide = "legend")

Assignment 7

7.1 Election Forecasting Revisited

In the recitation from Unit 3, we used logistic regression on polling data in order to construct US presidential election predictions. We separated our data into a training set, containing data from 2004 and 2008 polls, and a test set, containing the data from 2012 polls. We then proceeded to develop a logistic regression model to forecast the 2012 US presidential election.

In this homework problem, we’ll revisit our logistic regression model from Unit 3, and learn how to plot the output on a map of the United States. Unlike what we did in the Crime lecture, this time we’ll be plotting predictions rather than data!

### Drawing a map of the United States

library(ggplot2)
library(maps)
library(ggmap)
# Polygon outlines of the continental US states (one row per vertex,
# polygons identified by "group").
statesMap = map_data("state")
str(statesMap)
## 'data.frame':    15537 obs. of  6 variables:
##  $ long     : num  -87.5 -87.5 -87.5 -87.5 -87.6 ...
##  $ lat      : num  30.4 30.4 30.4 30.3 30.3 ...
##  $ group    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ order    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region   : chr  "alabama" "alabama" "alabama" "alabama" ...
##  $ subregion: chr  NA NA NA NA ...
#How many different groups are there? 
# Each "group" is one closed polygon; the table below shows 63 of them
# (some states consist of more than one polygon).
table(statesMap$group)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##  202  149  312  516   79   91   94   10  872  381  233  329  257  256  113 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
##  397  650  399  566   36  220   30  460  370  373  382  315  238  208   70 
##   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45 
##  125  205   78   16  290   21  168   37  733   12  105  238  284  236  172 
##   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60 
##   66  304  166  289 1088   59  129   96   15  623   17   17   19   44  448 
##   61   62   63 
##  373  388   68
# Draw the blank map: one white polygon per group with black borders.
ggplot(statesMap, aes(x = long, y = lat, group = group)) + geom_polygon(fill = "white", color = "black") 

### Coloring the States by Predictions

# Polling data from the Unit 3 election model, with missing values
# already imputed: 145 state-year observations over 2004/2008/2012.
polling<-read.csv("PollingImputed.csv")
str(polling)
## 'data.frame':    145 obs. of  7 variables:
##  $ State     : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 2 2 3 3 3 4 4 4 ...
##  $ Year      : int  2004 2008 2004 2008 2004 2008 2012 2004 2008 2012 ...
##  $ Rasmussen : int  11 21 19 16 5 5 8 7 10 13 ...
##  $ SurveyUSA : int  18 25 21 18 15 3 5 5 7 21 ...
##  $ DiffCount : int  5 5 1 6 8 9 4 8 5 2 ...
##  $ PropR     : num  1 1 1 1 1 ...
##  $ Republican: int  1 1 1 1 1 1 1 1 1 1 ...
# Train on the 2004 and 2008 elections, test on 2012.
Train <- subset(polling,Year == 2004|Year ==2008 )
Test <- subset(polling,Year ==2012)
# Sanity check: the two subsets partition the full dataset.
nrow(Train)+nrow(Test)== nrow(polling)
## [1] TRUE
# Logistic regression: probability a state votes Republican, from two
# polling-derived variables.
mod2 = glm(Republican~SurveyUSA + DiffCount, data = Train, family = "binomial")
TestPrediction = predict(mod2,newdata = Test, type = "response")
# Threshold the predicted probabilities at 0.5 to get a binary call.
TestPredictionBinary = as.numeric(TestPrediction>0.5)
predictionDataFrame= data.frame(TestPrediction,TestPredictionBinary,Test$State)
str(predictionDataFrame)
## 'data.frame':    45 obs. of  3 variables:
##  $ TestPrediction      : num  9.74e-01 9.99e-01 9.26e-05 9.43e-03 3.43e-05 ...
##  $ TestPredictionBinary: num  1 1 0 0 0 1 1 0 1 0 ...
##  $ Test.State          : Factor w/ 50 levels "Alabama","Alaska",..: 3 4 5 6 7 9 10 11 12 13 ...
#For how many states is our binary prediction 1 (for 2012), corresponding to Republican?
# Per the table below: 22 states predicted Republican, 23 Democrat.
table(predictionDataFrame$TestPredictionBinary)
## 
##  0  1 
## 23 22
#What is the average predicted probability of our model (on the Test set, for 2012)?
# The mean (about 0.485) is read off the summary below.
summary(predictionDataFrame$TestPrediction)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000001 0.0000926 0.0648700 0.4853000 0.9986000 0.9999000
# Lowercase state names to match the "region" key used by map_data.
predictionDataFrame$region = tolower(predictionDataFrame$Test.State)
predictionMap <- merge(statesMap,predictionDataFrame,by="region")
#make sure the observations are in order so that the map is drawn properly,
# merge() does not preserve row order, and geom_polygon connects vertices
# in the order they appear -- so restore the original vertex ordering.
predictionMap <- predictionMap[order(predictionMap$order),]
str(predictionMap)
## 'data.frame':    15034 obs. of  9 variables:
##  $ region              : chr  "arizona" "arizona" "arizona" "arizona" ...
##  $ long                : num  -115 -115 -115 -115 -115 ...
##  $ lat                 : num  35 35.1 35.1 35.2 35.2 ...
##  $ group               : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ order               : int  204 205 206 207 208 209 210 211 212 213 ...
##  $ subregion           : chr  NA NA NA NA ...
##  $ TestPrediction      : num  0.974 0.974 0.974 0.974 0.974 ...
##  $ TestPredictionBinary: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Test.State          : Factor w/ 50 levels "Alabama","Alaska",..: 3 3 3 3 3 3 3 3 3 3 ...
str(statesMap) # predictions exist for only 45 states, so predictionMap has fewer observations than statesMap
## 'data.frame':    15537 obs. of  6 variables:
##  $ long     : num  -87.5 -87.5 -87.5 -87.5 -87.6 ...
##  $ lat      : num  30.4 30.4 30.4 30.3 30.3 ...
##  $ group    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ order    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region   : chr  "alabama" "alabama" "alabama" "alabama" ...
##  $ subregion: chr  NA NA NA NA ...
# color the map with predictions
# Default continuous fill for the 0/1 binary predictions.
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPredictionBinary )) + geom_polygon(color="black")

# replot with discrete outcome and use red (Republican) and blue (Democrat)
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPredictionBinary )) + geom_polygon(color="black") + scale_fill_gradient(low = "blue", high = "red", guide = "legend", breaks= c(0,1), labels = c("Democrat", "Republican"), name = "Prediction 2012")

# plot the probabilities instead of the binary predictions. 
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPrediction )) + geom_polygon(color="black") + scale_fill_gradient(low = "blue", high = "red", guide = "legend", breaks= c(0,1), labels = c("Democrat", "Republican"), name = "Prediction 2012")

# What was our predicted probability for the state of Florida (that was incorrectly predicted to be Republican)
predictionDataFrame$TestPrediction[predictionDataFrame$region =="florida"]
## [1] 0.9640395
# linetype=3 draws dotted state borders.
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPredictionBinary )) + geom_polygon(color="black",linetype=3) + scale_fill_gradient(low = "blue", high = "red", guide = "legend", breaks= c(0,1), labels = c("Democrat", "Republican"), name = "Prediction 2012")

# size=3 thickens the borders considerably.
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPredictionBinary )) + geom_polygon(color="black",size=3) + scale_fill_gradient(low = "blue", high = "red", guide = "legend", breaks= c(0,1), labels = c("Democrat", "Republican"), name = "Prediction 2012")

# alpha=0.3 makes the state fills translucent.
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPredictionBinary )) + geom_polygon(color="black",alpha=0.3) + scale_fill_gradient(low = "blue", high = "red", guide = "legend", breaks= c(0,1), labels = c("Democrat", "Republican"), name = "Prediction 2012")

7.2 Visualizing Network Data

The cliché goes that the world is an increasingly interconnected place, and the connections between different entities are often best represented with a graph. Graphs are composed of vertices (also often called “nodes”) and edges connecting those nodes. In this assignment, we will learn how to visualize networks using the igraph package in R.

For this assignment, we will visualize social networking data using anonymized data from Facebook; this data was originally curated in a recent paper about computing social circles in social networks. In our visualizations, the vertices in our network will represent Facebook users and the edges will represent these users being Facebook friends with each other.

# Anonymized Facebook data: edges = one row per friendship,
# users = one row per user with anonymized attributes.
edges<- read.csv("edges.csv")
users<-read.csv("users.csv")
str(users)
## 'data.frame':    59 obs. of  4 variables:
##  $ id    : int  3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 ...
##  $ gender: Factor w/ 3 levels "","A","B": 2 3 3 3 3 3 2 3 3 2 ...
##  $ school: Factor w/ 3 levels "","A","AB": 2 1 1 1 1 2 1 1 2 1 ...
##  $ locale: Factor w/ 3 levels "","A","B": 3 3 3 3 3 3 2 3 3 2 ...
str(edges)
## 'data.frame':    146 obs. of  2 variables:
##  $ V1: int  4019 4023 4023 4027 3988 3982 3994 3998 3993 3982 ...
##  $ V2: int  4026 4031 4030 4032 4021 3986 3998 3999 3995 4021 ...
#How many Facebook users are there in our dataset?
nrow(users)
## [1] 59
#what is the average number of friends per user?
# Hand-counted friends per user, summed and divided by the user count.
friends = c(7,13,1,0,5,8,1,6,5,3,2,2,5,10,8,3,3,10,13,3,8,1,6,4,9,2,1,3,0,9,0,3,1,5,11,0,3,8,6,7,7,10,0,17,0,3,8,6,1,1,18,10,1,2,1,0,1,3,8)
sum(friends)/nrow(users)
## [1] 4.949153
# Equivalent without hand-counting: each edge contributes to two users.
avgFriends = nrow(edges)*2 / nrow(users)
#Out of all the students who listed a school, what was the most common locale?
summary(users)
##        id       gender school  locale
##  Min.   : 594    : 2     :40    : 3  
##  1st Qu.:3994   A:15   A :17   A: 6  
##  Median :4009   B:42   AB: 2   B:50  
##  Mean   :3952                        
##  3rd Qu.:4024                        
##  Max.   :4038
#Is it possible that either school A or B is an all-girls or all-boys school?

#install.packages("igraph")
library(igraph)
# Build an undirected graph (second argument FALSE = undirected) from the
# edge list, with the users data frame supplying the vertex attributes.
g = graph.data.frame(edges,FALSE,users)
str(g)
## IGRAPH UN-- 59 146 -- 
## + attr: name (v/c), gender (v/c), school (v/c), locale (v/c)
## + edges (vertex names):
## 3981 -- 3994, 3997, 3998, 4009, 4018, 4019, 4023
## 3982 -- 3986, 3988, 3994, 3997, 3998, 4003, 4009, 4014, 4021,
##         4023, 4026, 4030, 4037
## 3983 -- 4017
## 3984 --
## 3985 -- 3988, 3993, 3995, 4004, 4014
## 3986 -- 3982, 4000, 4014, 4017, 4021, 4026, 4030, 4033
## 3987 -- 4012
## 3988 -- 3982, 3985, 3993, 3995, 4021, 4030
## 3989 -- 3991, 594, 4011, 4013, 4038
## 3990 -- 4007, 4016, 4025
## 3991 -- 3989, 4031
## 3992 -- 4000, 4017
## 3993 -- 3985, 3988, 3995, 4004, 4030
## 3994 -- 3981, 3982, 3996, 3997, 3998, 4009, 4018, 4019, 4023, 4030
## 3995 -- 3985, 3988, 3993, 4000, 4004, 4014, 4023, 4026
## 594 -- 3989, 4011, 4031
## 3996 -- 3994, 4002, 4028
## 3997 -- 3981, 3982, 3994, 3998, 4009, 4018, 4019, 4021, 4023, 4030
## 3998 -- 3981, 3982, 3994, 3997, 3999, 4002, 4005, 4009, 4014,
##         4018, 4019, 4021, 4023
## 3999 -- 3998, 4005, 4036
## 4000 -- 3986, 3992, 3995, 4017, 4021, 4026, 4030, 4033
## 4001 -- 4029
## 4002 -- 3996, 3998, 4020, 4023, 4027, 4031
## 4003 -- 3982, 4009, 4023, 4030
## 4004 -- 3985, 3993, 3995, 4013, 4020, 4023, 4030, 4031, 4038
## 4005 -- 3998, 3999
## 4006 -- 4027
## 4007 -- 3990, 4016, 4025
## 4008 --
## 4009 -- 3981, 3982, 3994, 3997, 3998, 4003, 4019, 4023, 4030
## 4010 --
## 4011 -- 3989, 594, 4031
## 4012 -- 3987
## 4013 -- 3989, 4004, 4023, 4031, 4038
## 4014 -- 3982, 3985, 3986, 3995, 3998, 4017, 4021, 4023, 4030,
##         4037, 4038
## 4015 --
## 4016 -- 3990, 4007, 4025
## 4017 -- 3983, 3986, 3992, 4000, 4014, 4021, 4026, 4030
## 4018 -- 3981, 3994, 3997, 3998, 4023, 4030
## 4019 -- 3981, 3994, 3997, 3998, 4009, 4026, 4030
## 4020 -- 4002, 4004, 4027, 4030, 4031, 4037, 4038
## 4021 -- 3982, 3986, 3988, 3997, 3998, 4000, 4014, 4017, 4026, 4030
## 4022 --
## 4023 -- 3981, 3982, 3994, 3995, 3997, 3998, 4002, 4003, 4004,
##         4009, 4013, 4014, 4018, 4030, 4031, 4034, 4038
## 4024 --
## 4025 -- 3990, 4007, 4016
## 4026 -- 3982, 3986, 3995, 4000, 4017, 4019, 4021, 4030
## 4027 -- 4002, 4006, 4020, 4031, 4032, 4038
## 4028 -- 3996
## 4029 -- 4001
## 4030 -- 3982, 3986, 3988, 3993, 3994, 3997, 4000, 4003, 4004,
##         4009, 4014, 4017, 4018, 4019, 4020, 4021, 4023, 4026
## 4031 -- 3991, 594, 4002, 4004, 4011, 4013, 4020, 4023, 4027, 4038
## 4032 -- 4027
## 4033 -- 3986, 4000
## 4034 -- 4023
## 4035 --
## 4036 -- 3999
## 4037 -- 3982, 4014, 4020
## 4038 -- 3989, 4004, 4013, 4014, 4020, 4023, 4027, 4031
#get.data.frame(g, what=c("both"))
# Draw the network; vertex.label = NA suppresses the id labels.
plot(g,vertex.size=5,vertex.label = NA)

# change size of the vertices 
# Scale each vertex by its degree (number of friends): size = degree/2 + 2.
V(g)$size=degree(g)/2+2
plot(g,vertex.label=NA)

V(g)$size
##  [1]  5.5  8.5  2.5  2.0  4.5  6.0  2.5  5.0  4.5  3.5  3.0  3.0  4.5  7.0
## [15]  6.0  3.5  3.5  7.0  8.5  3.5  6.0  2.5  5.0  4.0  6.5  3.0  2.5  3.5
## [29]  2.0  6.5  2.0  3.5  2.5  4.5  7.5  2.0  3.5  6.0  5.0  5.5  5.5  7.0
## [43]  2.0 10.5  2.0  3.5  6.0  5.0  2.5  2.5 11.0  7.0  2.5  3.0  2.5  2.0
## [57]  2.5  3.5  6.0
# Sizes range from 2 (degree 0) to 11 (degree 18) per the summary below.
summary(V(g)$size)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.500   3.500   4.475   6.000  11.000
#When we created our graph g, we provided it with the data frame users, which had variables gender, school, and locale. These are now stored as attributes V(g)$gender, V(g)$school, and V(g)$locale.
# Now color the vertices based on the gender of the user, 
V(g)$color = "black"  # default, covers the blank ("") gender level
V(g)$color[V(g)$gender == "A"] = "red"
V(g)$color[V(g)$gender=="B"] = "grey"
plot(g,vertex.label = NA)

#Now, color the vertices based on the school that each user in our network attended.
V(g)$color = "black"  # default, covers users with no school listed
V(g)$color[V(g)$school == "A"] = "blue"
V(g)$color[V(g)$school == "AB"] = "red"
plot(g,vertex.label = NA)

#Now, color the vertices based on the locale of the user.
V(g)$color = "black"  # default, covers users with no locale listed
V(g)$color[V(g)$locale == "A"] = "blue"
V(g)$color[V(g)$locale == "B"] = "red"
plot(g,vertex.label = NA)

7.3 Visualizing Text Data Using Word Clouds

Earlier in the course, we used text analytics as a predictive tool, using word frequencies as independent variables in our models. However, sometimes our goal is to understand commonly occurring topics in text data instead of to predict the value of some dependent variable. In such cases, word clouds can be a visually appealing way to display the most frequent words in a body of text. A word cloud arranges the most common words in some text, using size to indicate the frequency of a word. While we could generate word clouds using free generators available on the Internet, we will have more flexibility and control over the process if we do so in R. We will visualize the text of tweets about Apple, a dataset we used earlier in the course.

# Tweets about Apple; the text must stay character for text processing.
# FIX: the original passed stringsAsFactor= (no trailing "s"), which only
# worked via partial argument matching -- spell the argument out.
tweets <- read.csv("tweets.csv", stringsAsFactors = FALSE)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# FIX: build the corpus from the text column only. Passing the whole data
# frame (VectorSource(tweets)) makes each COLUMN a "document", which is
# why the DTM output below reports just 2 documents instead of one per
# tweet. (Assumes the text column is named Tweet -- confirm against the
# tweets.csv header.)
corpus <- Corpus(VectorSource(tweets$Tweet))
# content_transformer() wraps a plain string function such as tolower so
# tm_map keeps the documents as PlainTextDocument objects (tm >= 0.6);
# this replaces the original tm_map(corpus, PlainTextDocument) repair step.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
dtm <- DocumentTermMatrix(corpus)
# Inspect the document-term matrix, then convert it to a plain data frame
# so each term becomes a column with its per-document frequency.
dtm
## <<DocumentTermMatrix (documents: 2, terms: 3780)>>
## Non-/sparse entries: 3780/3780
## Sparsity           : 50%
## Maximal term length: 115
## Weighting          : term frequency (tf)
allTweets <- as.data.frame(as.matrix(dtm))
str(allTweets)
## 'data.frame':    2 obs. of  3780 variables:
##  $ 000                                                                                                                : num  1 0
##  $ 075                                                                                                                : num  3 0
##  $ 0909                                                                                                               : num  1 0
##  $ 0910                                                                                                               : num  1 0
##  $ 099                                                                                                                : num  1 0
##  $ 100                                                                                                                : num  2 0
##  $ 100m                                                                                                               : num  1 0
##  $ 1085                                                                                                               : num  3 0
##  $ 10min                                                                                                              : num  1 0
##  $ 110                                                                                                                : num  1 0
##  $ 13apple                                                                                                            : num  1 0
##  $ 13th                                                                                                               : num  1 0
##  $ 1415                                                                                                               : num  1 0
##  $ 16gb                                                                                                               : num  1 0
##  $ 16gbs                                                                                                              : num  1 0
##  $ 180                                                                                                                : num  1 0
##  $ 18092013                                                                                                           : num  1 0
##  $ 18th                                                                                                               : num  2 0
##  $ 199                                                                                                                : num  3 0
##  $ 1am                                                                                                                : num  1 0
##  $ 1jazzyjeff                                                                                                         : num  1 0
##  $ 1st                                                                                                                : num  1 0
##  $ 200                                                                                                                : num  2 0
##  $ 2000ad                                                                                                             : num  1 0
##  $ 2001                                                                                                               : num  1 0
##  $ 2002                                                                                                               : num  1 0
##  $ 2004                                                                                                               : num  1 0
##  $ 2005                                                                                                               : num  2 0
##  $ 2011with                                                                                                           : num  1 0
##  $ 2013                                                                                                               : num  4 0
##  $ 2014                                                                                                               : num  3 0
##  $ 20th                                                                                                               : num  4 0
##  $ 211                                                                                                                : num  1 0
##  $ 21st                                                                                                               : num  1 0
##  $ 22000                                                                                                              : num  1 0
##  $ 22nd                                                                                                               : num  1 0
##  $ 244tsuyoponzu                                                                                                      : num  6 0
##  $ 2nd                                                                                                                : num  2 0
##  $ 2shaneez                                                                                                           : num  1 0
##  $ 2week                                                                                                              : num  1 0
##  $ 300                                                                                                                : num  2 0
##  $ 30aud                                                                                                              : num  1 0
##  $ 30mins                                                                                                             : num  1 0
##  $ 320k                                                                                                               : num  1 0
##  $ 350                                                                                                                : num  1 0
##  $ 3gs                                                                                                                : num  1 0
##  $ 3rd                                                                                                                : num  1 0
##  $ 3yr                                                                                                                : num  1 0
##  $ 40000                                                                                                              : num  1 0
##  $ 40k                                                                                                                : num  1 0
##  $ 40mb                                                                                                               : num  1 0
##  $ 4eva                                                                                                               : num  1 0
##  $ 500                                                                                                                : num  1 0
##  $ 528                                                                                                                : num  1 0
##  $ 549                                                                                                                : num  2 0
##  $ 550                                                                                                                : num  1 0
##  $ 55cs                                                                                                               : num  1 0
##  $ 55mb                                                                                                               : num  1 0
##  $ 5cheap                                                                                                             : num  2 0
##  $ 5s5c                                                                                                               : num  2 0
##  $ 5same                                                                                                              : num  1 0
##  $ 5sc                                                                                                                : num  2 0
##  $ 5sdo                                                                                                               : num  1 0
##  $ 5slfw                                                                                                              : num  1 0
##  $ 5th                                                                                                                : num  2 0
##  $ 600                                                                                                                : num  1 0
##  $ 6000                                                                                                               : num  1 0
##  $ 64bit                                                                                                              : num  4 0
##  $ 65min                                                                                                              : num  1 0
##  $ 6monthlifespan                                                                                                     : num  1 0
##  $ 719                                                                                                                : num  1 0
##  $ 740                                                                                                                : num  1 0
##  $ 777                                                                                                                : num  1 0
##  $ 7evenstarz                                                                                                         : num  7 0
##  $ 7pm                                                                                                                : num  1 0
##  $ 7wow                                                                                                               : num  1 0
##  $ 86m                                                                                                                : num  1 0
##  $ 884                                                                                                                : num  1 0
##  $ 899                                                                                                                : num  1 0
##  $ 8bitsound                                                                                                          : num  1 0
##  $ 900                                                                                                                : num  1 0
##  $ 917                                                                                                                : num  1 0
##  $ 99preorder                                                                                                         : num  1 0
##  $ a7s                                                                                                                : num  1 0
##  $ aaaaaapple                                                                                                         : num  1 0
##  $ aapl                                                                                                               : num  2 0
##  $ abrsm                                                                                                              : num  1 0
##  $ absolutely                                                                                                         : num  4 0
##  $ abt                                                                                                                : num  2 0
##  $ acampan                                                                                                            : num  1 0
##  $ acceptable                                                                                                         : num  1 0
##  $ access                                                                                                             : num  4 0
##  $ accidentally                                                                                                       : num  2 0
##  $ acciones                                                                                                           : num  3 0
##  $ according                                                                                                          : num  1 0
##  $ accordingtonina                                                                                                    : num  1 0
##  $ account                                                                                                            : num  4 0
##  $ accurate                                                                                                           : num  1 0
##  $ ace                                                                                                                : num  1 0
##   [list output truncated]
# How many distinct terms does the document-term data frame contain?
ncol(unique(allTweets))
## [1] 3780
#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
# wordcloud() wants a vector of words as its first argument; in allTweets
# the terms are the column names.
cloudWords <- colnames(allTweets)
# Its second argument is each word's frequency: summing every column gives
# the total count of each term across all tweets.
cloudFreq <- colSums(allTweets)
wordcloud(cloudWords, cloudFreq, scale = c(2, 0.25))

# Remove "apple" (the most frequent word) from the corpus: it appears in
# almost every tweet, so it drowns out the informative terms in the cloud.
wordToRemove <- "apple"
corpusNoApple <- tm_map(corpus, removeWords, wordToRemove)
dtmNoApple <- DocumentTermMatrix(corpusNoApple)
dtmNoApple
## <<DocumentTermMatrix (documents: 2, terms: 3779)>>
## Non-/sparse entries: 3779/3779
## Sparsity           : 50%
## Maximal term length: 115
## Weighting          : term frequency (tf)
allTweetsNoApple <- as.data.frame(as.matrix(dtmNoApple))
wordsNoApple <- colnames(allTweetsNoApple)
# Sanity check: confirm "apple" is no longer among the terms
"apple" %in% wordsNoApple
## [1] FALSE
# Recompute per-term frequencies on the apple-free data frame
freqNoApple <- colSums(allTweetsNoApple)
wordcloud(wordsNoApple, freqNoApple, scale = c(2, 0.25),
          random.order = FALSE, min.freq = 3, max.words = Inf)

# Tighter cloud: keep only words seen at least 10 times, cap at 100 words
wordcloud(wordsNoApple, freqNoApple, scale = c(2, 0.25),
          random.order = FALSE, min.freq = 10, max.words = 100)

# Changing rot.per: rotate 90% of the words to vertical
wordcloud(wordsNoApple, freqNoApple, scale = c(2, 0.25),
          random.order = FALSE, rot.per = 0.9)

# Color by frequency using the five darkest shades of the "Blues" palette
wordcloud(wordsNoApple, freqNoApple, scale = c(2, 0.25),
          random.order = FALSE, rot.per = 0.1, random.color = FALSE,
          colors = brewer.pal(9, "Blues")[5:9])

# Same palette selection written with negative indexing:
#wordcloud(wordsNoApple,freqNoApple,scale=c(2, 0.25), random.order=FALSE, rot.per=0.1,random.color=FALSE,colors=brewer.pal(9, "Blues")[c(-1, -2, -3, -4)])